import networkx as nx
import pandas as pd
import pickle as pkl
import numpy as np
from numpy.random import RandomState # random_state for networkX only for python3.6
rng = RandomState(787351)
import os
DATA_DIR = '../../data/data_schoolofinf'
# Graphing/Image
SAVE_GRAPHS = True
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use(['seaborn-poster'])
import logging
logging.basicConfig(
format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from network_artist import *
from infnet_helper import *
In this notebook, we create and visualise three different models of the informatics network:
To create the informatics collaboration network, we need to do additional processing to the lookup tables (dataframe) for poinf and pub.
lookup_pub
Filter the list of publications, limiting the dataset to the period 1997-2017. Further constraints are applied later for each model.
# Load the publication lookup table through the project helper, passing the
# 1997 and 2017 bounds described in the text above.
lookup_pub = get_lookup_pub(1997,2017)
# Preview the first rows and print the frame summary (dtypes / non-null counts).
lookup_pub.head(3)
lookup_pub.info()
We can add the year that an individual makes his/her first publication. This would allow us to create a temporal evolution of the network graph.
lookup_poinf
_lookup_poinf = get_lookup_poinf()
# Load the individuals table through the project helper and discard rows
# whose `first_pub_year` is missing (NaN), i.e. people without a recorded
# first publication.
lookup_poinf = get_lookup_poinf().dropna(axis=0, subset=['first_pub_year'])

# Keep only individuals with at least one publication overall AND at least
# one counted in `total_pub_1997`.
has_publications = lookup_poinf.total_pub.gt(0) & lookup_poinf.total_pub_1997.gt(0)
lookup_poinf = lookup_poinf[has_publications]
lookup_poinf.info()
# Publication counts broken down by `nb_poinf` (number of School of
# Informatics authors recorded on each publication).
n_pubs = len(lookup_pub)

# Publications with no informatics author at all.
no_poinf = len(lookup_pub[lookup_pub.nb_poinf.eq(0)])
ratio_ = no_poinf / n_pubs
print(no_poinf, ratio_)

# Publications with exactly one informatics author.
one_poinf_only = len(lookup_pub[lookup_pub.nb_poinf.eq(1)])
ratio_ = one_poinf_only / n_pubs
print(one_poinf_only, ratio_)

# Publications with two or more informatics authors (whole period).
with_poinf_infnet20yr = len(lookup_pub[lookup_pub.nb_poinf.ge(2)])
print((with_poinf_infnet20yr, n_pubs))
print((with_poinf_infnet20yr / n_pubs))

# Same statistic restricted to publications from 2012 onwards.
lookup_pub_infnet6yr = lookup_pub[lookup_pub.year.ge(2012)]
n_pubs_6yr = len(lookup_pub_infnet6yr)
with_poinf_6yr = len(
    lookup_pub_infnet6yr[lookup_pub_infnet6yr.nb_poinf.ge(2)])
print((with_poinf_6yr, n_pubs_6yr))
print((with_poinf_6yr / n_pubs_6yr))

# Publications with at least four informatics authors where every author is
# from informatics (bare expression: only displayed when run as a cell).
x = lookup_pub[lookup_pub.nb_poinf.ge(4)]
x[x.nb_poinf == x.nb_authors]
institutes information
Class labels for each institute
# Mapping of institute name -> class id, as provided by the project helper.
INSTITUTES = get_institute()

# Split the individuals by their institute class and report each class size.
gb = lookup_poinf.groupby('institute_class')
for k, group in gb:
    # Reverse lookup: first institute name whose class id equals `k`.
    matching_names = [name for name, class_id in INSTITUTES.items()
                      if class_id == k]
    className = matching_names[0]
    summary = 'class {}-{}: {}'.format(k, className, len(group))
    print(summary)
The class distribution is uneven! Classes 1, 2, 3, 4, 5, 6, and 7 are the institutes in the School of Informatics. Additional classes are found, but they are relatively small, except for the UNKNOWN class, which contains individuals whose institutes are not recorded.
# Report how many individuals remain after filtering. The argument is a
# single tuple, so the output is the tuple's repr rather than two fields.
print(('Number of individuals in informatics:', len(lookup_poinf)))
In this section, we only consider a simple undirected graph of the informatics collaboration network.
# Flatten the per-publication edge lists into one list of author pairs.
all_edges = [pair for pub_edges in lookup_pub.edges for pair in pub_edges]

# The graph is simple and undirected: keep one orientation of each pair.
# Exact repeats of (a, b) collapse automatically because this is a set; the
# explicit check skips a pair whose reverse (b, a) is already stored.
unique_edges = set()
for first, second in all_edges:
    assert first != second, "SELF LOOPS DETECTED!"
    if (second, first) not in unique_edges:
        unique_edges.add((first, second))
print('number of unique edge pairs: ', len(unique_edges))

# Keep only pairs where BOTH authors appear in the informatics lookup table.
poinf_ids = lookup_poinf.index
poinf_edges_only = [(au1, au2) for (au1, au2) in unique_edges
                    if au1 in poinf_ids and au2 in poinf_ids]
print('number of unique edge pairs (informatics only): ', len(poinf_edges_only))
infnet-20yr
g_infnet20yr: the informatics collaboration graph from 1997-2017
# Build the 1997-2017 collaboration graph from the informatics-only pairs.
g_infnet20yr = nx.from_edgelist(poinf_edges_only)
print(nx.info(g_infnet20yr))

# Sorted node ids give a stable row/column order for the matrices below and
# for the node list written to disk.
g_infnet20yr_nodeorder = sorted(g_infnet20yr.nodes)

# `nx.adj_matrix` was only a deprecated alias (removed in NetworkX 3.0);
# `nx.adjacency_matrix` is the canonical name and exists in every release.
adjmat_infnet20yr = nx.adjacency_matrix(
    g_infnet20yr, nodelist=g_infnet20yr_nodeorder)

# Project helper: returns the adjacency matrix, the figure and the order used.
adj_mat, fig, order = create_adj_mat(
    g_infnet20yr, g_infnet20yr_nodeorder, draw=True, use_order=True)
if SAVE_GRAPHS:
    plt.savefig(
        'IMG/infnet20yr_adj_mat.png',
        format='png',
        transparent=True,
        bbox_inches='tight')
adj_mat.dump(os.path.join(DATA_DIR, 'mat', 'infnet20yrs-adj-mat.pkl'))

# Save the list of individuals in the graph (one id per line); later cells
# read this file back to recover the node order.
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_1997-2017.txt'), 'w') as f:
    f.write("\n".join(g_infnet20yr_nodeorder))
NOTE: The number of individuals with a publication is expected to be 228; however, only 194 nodes are present in our graph. Why? Because we only kept edges where both endpoints are individuals from informatics, so anyone without such a collaboration does not appear as a node.
Here, we illustrate each network using different layouts
Spring Layout overall
# this is the pos for the entire network! we can use this to maintain the visualisation across different experiments
# Node positions for the whole 20-year network (Kamada-Kawai layout).
# `pos_full` is reused by later cells so nodes keep the same coordinates.
pos_full = nx.kamada_kawai_layout(g_infnet20yr)

f = plt.figure(figsize=(8, 8))
ax = f.add_subplot(111)
ax.axis('off')

# Nodes are coloured by the project helper `color_by_inst`.
full_draw_options = dict(
    pos=pos_full,
    ax=ax,
    node_size=50,
    alpha=.8,
    edge_color='#999966',
    node_color=color_by_inst(g_infnet20yr, lookup_poinf),
)
nx.draw(g_infnet20yr, **full_draw_options)

# NOTE: saved unconditionally (not gated on SAVE_GRAPHS).
f.savefig('IMG/infnet20yr_spring.png', format='png', bbox_inches='tight')
Circular Layout for each class
draw_default_layout(
g_infnet20yr, lookup_poinf, file_prefix='infnet20yr', with_weight=False)
Shell Layout
# visualise:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.axis('off')

# Shell membership comes from the project helper `get_default_nlist`.
shell_members = get_default_nlist(g_infnet20yr, lookup_poinf, as_dict=False)
shell_positions = nx.shell_layout(g_infnet20yr, shell_members)

nx.draw_networkx(
    g_infnet20yr,
    pos=shell_positions,
    with_labels=False,
    ax=ax,
    edge_color='#999966',
    node_size=40,
    node_color=color_by_inst(g_infnet20yr, lookup_poinf))

if SAVE_GRAPHS:
    # Optional extras, left disabled: a title, institute labels via
    # `add_inst_labels(ax)`, and a PDF copy ("IMG/infnet20yr_shell.pdf").
    plt.savefig("IMG/infnet20yr_shell.png", format='png', bbox_inches="tight")
Circular Layout
Ordered according to institute.
# Draw the 20-year network with the project's circular-layout helper;
# `file_prefix` is presumably used to name the saved image(s) - confirm in
# network_artist / infnet_helper.
draw_circular_layout(g_infnet20yr, lookup_poinf, file_prefix='infnet20yr')
3D network graph
import igraph as ig
from plotly.graph_objs import *
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
offline.init_notebook_mode()
# Mirror the NetworkX graph in igraph to obtain a 3-D Kamada-Kawai layout.
igraph_G = ig.Graph.TupleList(g_infnet20yr.edges)
layt = igraph_G.layout('kk', dim=3)
vs = igraph_G.vs
TRACES = []
_data = {}

# VERTICES: bucket coordinates and ids by institute class so that each class
# becomes one scatter trace (and one legend entry).
for _id in g_infnet20yr.nodes:
    _idx = vs.find(_id).index
    _k = lookup_poinf.loc[_id].institute_class  # institute_class of individual
    coords = layt[_idx]
    bucket = _data.setdefault(_k, {'Xn': [], 'Yn': [], 'Zn': [], "ids": []})
    bucket['Xn'].append(coords[0])  # x-coordinates
    bucket['Yn'].append(coords[1])  # y-coordinates
    bucket['Zn'].append(coords[2])  # z-coordinates
    bucket['ids'].append(_id)

# Scatter nodes: one trace per institute class, in ascending class order.
for k in sorted(_data):
    v = _data[k]
    # Hover text is each individual's full name.
    labels = lookup_poinf.loc[v['ids']].full_name.tolist()
    node_marker = Marker(
        symbol='dot',
        size=6,
        color=inst_by_color[k],
        line=Line(color='rgb(50,50,50)', width=0.5))
    # Legend label: first institute name mapped to this class id.
    institute_name = [n for (n, _k) in INSTITUTES.items() if _k == k][0]
    _trace = Scatter3d(
        x=v['Xn'],
        y=v['Yn'],
        z=v['Zn'],
        mode='markers',
        marker=node_marker,
        text=labels,
        hoverinfo='text',
        showlegend=True,
        name=institute_name)
    TRACES.append(_trace)
# EDGES: a single line trace. Each edge contributes its two endpoints
# followed by None, so consecutive edges are not joined to each other.
Xe, Ye, Ze = [], [], []
vs = igraph_G.vs
for (a, b) in g_infnet20yr.edges:
    start = layt[vs.find(a).index]
    end = layt[vs.find(b).index]
    Xe.extend([start[0], end[0], None])
    Ye.extend([start[1], end[1], None])
    Ze.extend([start[2], end[2], None])

edge_line = Line(color='rgb(125,125,125)', width=2)
trace1 = Scatter3d(
    x=Xe,
    y=Ye,
    z=Ze,
    mode='lines',
    line=edge_line,
    hoverinfo="none",
    showlegend=False)
TRACES.append(trace1)
# Shared settings for the three scene axes: hide background, axis line,
# zero line, grid, tick labels and title.
axis = dict(
    showbackground=False,
    showline=False,
    zeroline=False,
    showgrid=False,
    showticklabels=False,
    title="")
# Figure layout: 1000x1000 canvas with the legend anchored at (0, 0).
layout = Layout(
    title="Informatics Collaboration Network from 1997-2017",
    width=1000,
    height=1000,
    showlegend=True,
    legend=dict(x=0, y=0, xanchor='auto', yanchor='auto'),
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(t=50),
    # Disabled experiments (year slider / annotations), kept for reference:
    # hovermode="x",
    # xaxis={"range":[1997,2017], 'title':'Year'},
    # sliders={
    # 'args':[
    # 'trainsition', {
    # 'duration': 400,
    # 'easing':'cubic-in-out'
    # }
    # ],
    # 'initialValue':'1997',
    # 'plotlycommand':'animate',
    # 'values':years,
    # 'visible':True
    # }
    # annotations=Annotations([
    # Annotation(
    # showarrow=False,
    # text='Colored by institutes',
    # xref='paper',
    # yref='paper',
    # x=0,
    # y=0.1,
    # xanchor='left',
    # yanchor='bottom',
    # font=Font(size=14))
    # ]),
)
# Combine the per-institute node traces and the edge trace, then render
# inline with plotly's offline mode.
data=Data(TRACES)
fig=Figure(data=data, layout=layout)
offline.iplot(fig)
In this section we visualise each of the seven institutes in School of Informatics
# Per-institute view of the 1997-2017 network: the full graph in the first
# panel, then one panel per School of Informatics institute (classes 1-7).
#
# Fix: this cell referenced names that are never defined at this point in
# the notebook (`g_poinf_only`, `institutes`, and `pos`, which is only
# assigned much later for the weighted graph). It now uses `g_infnet20yr`,
# `INSTITUTES` and `pos_full`, and passes `lookup_poinf` to `color_by_inst`
# like the other drawing cells do.


def _average_degree(graph):
    """Return the mean node degree of `graph`, or 0.0 if it has no nodes."""
    if len(graph) == 0:
        return 0.0
    return sum(dict(graph.degree).values()) / len(graph)


gb_dept = lookup_poinf.groupby('institute_class')

# visualise:
fig = plt.figure(figsize=(9, 80))
ax = fig.add_subplot(8, 1, 1)
ax.axis('off')
ax.set_title(
    'School of Informatics Collaboration Network\n{} individuals\nAverage degree = {:.3f}'.
    format(len(g_infnet20yr), _average_degree(g_infnet20yr)))
nx.draw_networkx(
    g_infnet20yr,
    pos=pos_full,
    with_labels=False,
    ax=ax,
    node_size=40,
    node_color=color_by_inst(g_infnet20yr, lookup_poinf))

ks = [1, 2, 3, 4, 5, 6, 7]
for k in ks:
    # First institute name mapped to class id `k`.
    inst = [name for (name, _k) in INSTITUTES.items() if _k == k][0]
    ax = fig.add_subplot(8, 1, k + 1)
    ax.axis('off')
    individuals = gb_dept.get_group(k).index
    # Ids that are not nodes of the graph are ignored by `subgraph`, so an
    # institute's panel can be empty; `_average_degree` guards that case.
    g = g_infnet20yr.subgraph(individuals)
    nx.draw_networkx(
        g,
        pos=nx.spring_layout(g),
        with_labels=False,
        ax=ax,
        node_size=40,
        nodelist=list(g.nodes),
        node_color=color_by_inst(g, lookup_poinf))
    title = "{}\n{} individuals\nAverage degree = {:.3f}".format(
        inst, len(g), _average_degree(g))
    ax.set_title(title)

if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_strat_department.pdf", format='pdf', bbox_inches="tight")
infnet every 5 years
We can now observe the evolution of the informatics network on a time scale of 5 years.
# Snapshot years: five evenly spaced integers from 1997 to 2017 inclusive.
years = np.linspace(1997, 2017, num=5, dtype=int)
years

# One panel per snapshot year after the first. Each panel shows the
# sub-network of individuals whose first publication is not later than that
# year, drawn at the shared `pos_full` coordinates so panels are comparable.
#
# Changes from the previous version of this cell:
#   * the per-panel shell-list computation was removed: its only consumer
#     (`pos=nx.shell_layout(g, _nlist)`) was commented out, so the result
#     was never used;
#   * the loop-invariant `total_pub_1997 > 0` filter is computed once;
#   * the panel count follows `years` instead of a hard-coded 4.
fig = plt.figure(figsize=(50, 10))
snapshot_years = years[1:]
n_panels = len(snapshot_years)
_lookup_poinf = lookup_poinf[lookup_poinf.total_pub_1997 > 0]

for i, yr in enumerate(snapshot_years, 1):
    # Drop everyone whose first publication comes after this snapshot year.
    df = _lookup_poinf.drop(
        _lookup_poinf[_lookup_poinf.first_pub_year > yr].index)
    individuals = list(df.index)
    g = g_infnet20yr.subgraph(individuals)

    ax = fig.add_subplot(1, n_panels, i)
    # ax.set_title('End of {}'.format(yr))
    ax.axis('off')
    nx.draw_networkx(
        g,
        pos=pos_full,
        with_labels=False,
        ax=ax,
        node_size=60,
        node_color=color_by_inst(g, lookup_poinf))

if SAVE_GRAPHS:
    # fig.suptitle('Evolution of Informatics Collaboration Network (1997-2017)')
    fig.savefig(
        "IMG/infnet20yr_evolution.png",
        format='png',
        bbox_inches="tight",
        transparent=True)
# Same snapshots, drawn with the project's circular-layout helper.
# NOTE(review): this cell excludes first_pub_year >= yr, whereas the previous
# cell excludes first_pub_year > yr - confirm which cutoff is intended.
fig = plt.figure(figsize=(50, 10))
i = 1
for yr in years[1:]:
    too_recent = lookup_poinf[lookup_poinf.first_pub_year >= yr].index
    df = lookup_poinf.drop(too_recent)
    individuals = list(df.index)
    g = g_infnet20yr.subgraph(individuals)

    # NOTE(review): this axis is created and hidden but never passed to the
    # helper below - presumably the helper draws on its own figure; confirm.
    ax = fig.add_subplot(1, 4, i)
    # ax.set_title('End of {}'.format(yr))
    ax.axis('off')

    # `i` is advanced before it is used in the prefix, so prefixes run from
    # 'evol_2' upwards.
    i += 1
    prefix = 'evol_{}'.format(i)
    draw_circular_layout(g, lookup_poinf, file_prefix=prefix, SAVE_GRAPHS=False)
# Degree histogram of the 20-year graph (bare expression: only displayed
# when run as a notebook cell).
nx.degree_histogram(g_infnet20yr)

# Project helper: returns an axis and a degree sequence.
ax, degree_seq = degree_dist(g_infnet20yr)
first_two_total = degree_seq[0] + degree_seq[1]
print(first_two_total)

# We can observe the power-law fit to our degree distribution:
# $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_degreeDist.pdf", format='pdf', bbox_inches="tight")
Ideally, a log-log plot of the CCDF of a power-law distribution should yield a straight line.
# Clustering statistics for the 20-year graph. `clustering_coeff` is a
# project helper; element 1 of its result is reported as the average.
cc = clustering_coeff(g_infnet20yr)
average_cc = cc[1]
print('average clustering coefficient: ', average_cc)

# Average clustering over nodes with non-zero clustering only (bare
# expression: only displayed when run as a notebook cell).
nx.average_clustering(g_infnet20yr, count_zeros=False)

# Transitivity is a single graph-level value (not a per-node triangle count).
transitivity_graph = nx.transitivity(g_infnet20yr)
print('Transitivity:', transitivity_graph)
# Connected components of the 20-year graph. `generateGCC` is a project
# helper returning the components and the share of the network each covers.
gccs, percentage = generateGCC(g_infnet20yr)

# display the connected components on a two-column grid:
fig = plt.figure(figsize=(10, 10))
# Fix: `len(gccs) / 2` is a float in Python 3 (add_subplot needs integers)
# and is one row short when the number of components is odd. Use an integer
# ceiling instead.
num_rows = (len(gccs) + 1) // 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_rows, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    nx.draw_networkx(
        g,
        node_color=color_by_inst(g, lookup_poinf),
        ax=ax,
        with_labels=False,
        node_size=30,
        edge_color='#999966',
        pos=pos_full)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_CC.pdf", format='pdf', bbox_inches="tight")
Now, we analyse the giant connected component from the network
# The first component is treated as the giant component (the original note
# says the list is sorted in reverse order of size).
main_gcc = gccs[0]
pos_gcc = nx.spring_layout(main_gcc)
print('number of nodes in largest connected component:', len(main_gcc))
print(nx.info(main_gcc))

# Draw the Network
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax = add_inst_labels(ax)
ax.axis('off')
gcc_draw_options = dict(
    pos=pos_gcc,
    with_labels=False,
    ax=ax,
    node_size=40,
    edge_color='#999966',
    node_color=color_by_inst(main_gcc, lookup_poinf),
)
nx.draw_networkx(main_gcc, **gcc_draw_options)

gcc_title = 'Largest connected component ({:.2%})'.format(percentage[0])
ax.set_title(gcc_title)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_LargestCC.pdf", format='pdf', bbox_inches="tight")
# Degree distribution and power-law fit for the giant component.
ax, degree_seq = degree_dist(main_gcc)
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig(
        "IMG/infnet20yr_LargestCC-degreeDist.pdf",
        format='pdf',
        bbox_inches="tight")

# Clustering (project helper; element 1 is reported as the average).
cc = clustering_coeff(main_gcc)
print('average clustering coefficient: ', cc[1])

# AVERAGE PATH (bare expression: only displayed when run as a notebook cell).
nx.average_shortest_path_length(main_gcc)

# Diameter:
gcc_diameter = nx.diameter(main_gcc)
print('Diameter of graph:', gcc_diameter)
import community
# Community detection using modularity: `parts` maps node -> community id.
parts = community.best_partition(main_gcc)
# Community id of each node, in the graph's node order (used as colours).
values = [parts.get(node) for node in main_gcc.nodes()]

# Plot the networks side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle(
    'Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)'
)

# Left: nodes coloured by their actual institute.
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax1,
    node_size=40,
    # Consistency fix: pass the lookup table explicitly, as every other
    # drawing cell in this notebook does.
    node_color=color_by_inst(main_gcc, lookup_poinf))

# Right: nodes coloured by detected community.
ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=40,
    with_labels=False)
plt.savefig(
    "IMG/infnet_LargestCC_commCompare.pdf", format='pdf', bbox_inches="tight")
num_comm = len(set(parts.values()))
print('Number of communities detected =', num_comm)

# plot each community in its own panel of a two-column grid:
fig = plt.figure(figsize=(10, 30))
comms = set(parts.values())
# Fix: the grid was fixed at 6x2, so a 13th community made add_subplot fail.
# Keep six rows as the minimum (identical layout for up to 12 communities)
# and grow only when needed.
n_rows = max(6, (len(comms) + 1) // 2)
for i, comm_id in enumerate(comms, 1):
    ax = fig.add_subplot(n_rows, 2, i)
    # find those nodes belonging to this community:
    nodes_from_comm = [
        node_id for (node_id, c) in parts.items() if c == comm_id
    ]
    # Generate the subgraph belonging to these nodes:
    subG = nx.subgraph(main_gcc, nodes_from_comm)
    ax.axis('off')
    ax.set_title('Community {}'.format(comm_id))
    nx.draw_networkx(
        subG,
        pos=pos_gcc,
        ax=ax,
        # Fix: colours are now derived from the subgraph that is drawn
        # (not from the plain id list, whose order need not match the
        # subgraph's node order), with the lookup table passed explicitly
        # as in the other drawing cells.
        node_color=color_by_inst(subG, lookup_poinf),
        node_size=20,
        with_labels=False)
# Institute legend entries are attached to the last panel only.
ax = add_inst_labels(ax)
plt.savefig(
    "IMG/infnet_LargestCC_communities.pdf", format='pdf', bbox_inches="tight")
Using betweenness centrality as a means to measure the influence of a node in the network
# Betweenness centrality of every node in the giant component (project
# helper); the `top` highest-scoring nodes are highlighted below.
bt = between_parallel(main_gcc)
top = 11
# we need to find the index of these max_nodes:
_nodes = list(main_gcc.nodes())
node_position = {node: idx for idx, node in enumerate(_nodes)}
max_nodes = sorted(bt.items(), key=lambda v: -v[1])[:top]
max_nodes

# variables for plotting the network: values tell nx how big each node should be
bt_values = [10] * len(_nodes)
bt_colors = ['xkcd:black'] * len(_nodes)
for max_key, max_val in max_nodes:
    idx = node_position[max_key]
    bt_values[idx] = (max_val * 150)**2.2  # SCALE IT ACCORDINGLY
    # Fix: `pd_poinf` is never defined in this notebook; the individuals
    # table is `lookup_poinf`. `.iloc[0]` extracts the scalar explicitly
    # (calling int() on a one-element Series is deprecated in pandas).
    institute_class = lookup_poinf.institute_class.loc[[str(max_key)]].iloc[0]
    bt_colors[idx] = inst_by_color[int(institute_class)]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.axis("off")
# The title follows `top` instead of hard-coding 11.
plt.suptitle('The top {} influential individuals in the GCC'.format(top))
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values,
    with_labels=False)
plt.savefig("IMG/infnet_influencer.pdf", format='pdf', bbox_inches="tight")

# Rows of the lookup table for the highlighted individuals.
top_ids = [a[0] for a in max_nodes]
lookup_poinf.loc[top_ids]
infnet-6yr Model :: 2012-2017
# What are the years?
# Distinct values of the `year` column, most recent first.
print(sorted(pd.unique(lookup_pub.year), reverse=True))
# There are some dates instead of purely years; they all fall outside the
# six-year period that we are interested in (2012-2017).

# Grouping the publications by year and keeping the six years of interest:
gb = lookup_pub.groupby('year')
sixYears = [2017, 2016, 2015, 2014, 2013, 2012]
pd_years = {}
print("Year:\tPub count")
for yr, group in gb:
    if yr not in sixYears:
        continue
    pd_years[yr] = group
    print("{}:\t{}".format(yr, len(group)))

# All publications from the six selected years in one frame.
combined_yrs = pd.concat(pd_years.values())
print("Total publications: ", len(combined_yrs))
# Flatten the per-publication edge lists of the six-year window.
all_edges_6yr = [pair for pub_edges in combined_yrs.edges for pair in pub_edges]
print('total number of edges: ', len(all_edges_6yr))

# Simple undirected graph: keep one orientation of each pair. Exact repeats
# collapse in the set; a pair whose reverse is already stored is skipped.
unique_edges_6yr = set()
for first, second in all_edges_6yr:
    assert first != second, "SELF LOOPS DETECTED!"
    if (second, first) not in unique_edges_6yr:
        unique_edges_6yr.add((first, second))
print('number of unique edge pairs: ', len(unique_edges_6yr))

# Keep only pairs where BOTH authors appear in the informatics lookup table.
poinf_ids_6yr = lookup_poinf.index
poinf6yr_edges_only = [(au1, au2) for (au1, au2) in unique_edges_6yr
                       if au1 in poinf_ids_6yr and au2 in poinf_ids_6yr]
print('size of collaboration network (number of edges): ',
      len(poinf6yr_edges_only))
Note the large decrease in the number of edges once pairs involving external individuals are removed (from 19,890 to 360).
Generating infnet-6yr
# Build the six-year collaboration graph from the informatics-only pairs.
g_infnet6yr = nx.from_edgelist(poinf6yr_edges_only)
# Spring layout for this graph; reused by later cells that draw its
# components so that nodes keep the same coordinates.
pos_6yr = nx.spring_layout(g_infnet6yr)
print(nx.info(g_infnet6yr))
# Node ids of the 20-year graph, read back from the file written earlier
# (one id per line).
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_1997-2017.txt'), 'r') as f:
    NODES_ORDER = [line.strip() for line in f]
len(NODES_ORDER)

# Project helper: returns the adjacency matrix, the figure and the node
# order used. `use_order=False` here; the output names below say "order6yr".
adj_mat_6yr, fig, order_6yr = create_adj_mat(
    g_infnet6yr, NODES_ORDER, draw=True, use_order=False)
if SAVE_GRAPHS:
    # Alternative name when the 20-year ordering is used instead:
    # 'IMG/infnet6yr_adj_mat_order20yr.png'
    plt.savefig(
        'IMG/infnet6yr_adj_mat_order6yr.png',
        format='png',
        transparent=True,
        bbox_inches='tight')
print(adj_mat_6yr.shape)
print(len(order_6yr))

# Persist the node order and the matrix for the six-year model.
order_path_6yr = os.path.join(DATA_DIR, 'poinf_collabgraph_2012-2017.txt')
with open(order_path_6yr, 'w') as f:
    f.write("\n".join(list(order_6yr)))
# adj_mat.dump(os.path.join(DATA_DIR,'mat','infnet6yrs-adj-mat.order20yr.pkl'))
adj_mat_6yr.dump(os.path.join(DATA_DIR, 'mat', 'infnet6yrs-adj-mat.order6yr.pkl'))
# Figure 1: six-year graph at its own spring-layout positions.
f = plt.figure(figsize=(8, 8))
ax = f.add_subplot(111)
own_layout_options = dict(
    pos=pos_6yr,
    node_size=30,
    ax=ax,
    node_color=color_by_inst(g_infnet6yr, lookup_poinf),
)
nx.draw(g_infnet6yr, **own_layout_options)
f.tight_layout()
f.savefig('IMG/infnet6yr_spring.png', format='png', bbox_inches='tight')

# Figure 2: same graph at the 20-year positions (`pos_full`) so it can be
# compared with the 20-year drawings. No `ax` is passed to nx.draw here, so
# it draws on the current axes, i.e. the one just created.
f = plt.figure(figsize=(8, 8))
ax = f.add_subplot(111)
shared_layout_options = dict(
    pos=pos_full,
    node_size=50,
    alpha=.8,
    edge_color='#999966',
    node_color=color_by_inst(g_infnet6yr, lookup_poinf),
)
nx.draw(g_infnet6yr, **shared_layout_options)
f.tight_layout()
f.savefig('IMG/infnet6yr_spring_winfnet20yrpos.png', format='png', bbox_inches='tight')
Circular layout for each class
draw_default_layout(g_infnet6yr, lookup_poinf, 'infnet6yr')
Circular Layout
draw_circular_layout(g_infnet6yr, lookup_poinf, file_prefix='infnet6yr', SAVE_GRAPHS=False)
Shell Layout
nlist = [[] for i in range(12)]
# Bucket every node into the list for its institute class (`nlist` is the
# list of twelve empty lists created just above).
for node in g_infnet6yr:
    c = int(lookup_poinf.institute_class.loc[[str(node)]])
    nlist[c].append(str(node))

# Merge the class lists by size: classes with the same number of members end
# up in the same shell. Shells keep the order in which each size first
# appears (the reverse sort by size is left disabled), and empty shells are
# dropped.
shells_by_size = {}
for members in nlist:
    shells_by_size.setdefault(len(members), []).extend(members)
_nlist = [shell for shell in shells_by_size.values() if len(shell) > 0]

# visualise:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.axis('off')
shell_positions_6yr = nx.shell_layout(g_infnet6yr, _nlist)
nx.draw_networkx(
    g_infnet6yr,
    pos=shell_positions_6yr,
    with_labels=False,
    ax=ax,
    edge_color='#999966',
    node_size=40,
    node_color=color_by_inst(g_infnet6yr, lookup_poinf))
# if SAVE_GRAPHS:
# ax.set_title('Informatics Collaboration Network from 1997-2017')
# ax = add_inst_labels(ax) #You can include the label by uncommenting `ax = add_inst_label(ax)`
# plt.savefig("IMG/infnet6yr_shell.pdf", format='pdf', bbox_inches="tight")
# plt.savefig(
# "IMG/infnet6yr_shell.png",
# format='png',
# bbox_inches="tight",
# transparent=True,
# )
print(nx.info(g_infnet6yr))

# Degree histogram (bare expression: only displayed when run as a cell).
nx.degree_histogram(g_infnet6yr)

# Project helper: returns an axis and a degree sequence.
ax, degree_seq = degree_dist(g_infnet6yr)
first_entry, second_entry = degree_seq[0], degree_seq[1]
print(first_entry, second_entry)

# Power-law fit: $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet6yr_degreeDist.pdf", format='pdf', bbox_inches="tight")
# Clustering statistics for the six-year graph. `clustering_coeff` is a
# project helper; element 1 of its result is reported as the average.
cc = clustering_coeff(g_infnet6yr)
average_cc_6yr = cc[1]
print('average clustering coefficient: ', average_cc_6yr)

# Average clustering over nodes with non-zero clustering only (bare
# expression: only displayed when run as a notebook cell).
nx.average_clustering(g_infnet6yr, count_zeros=False)

# Transitivity is a single graph-level value (not a per-node triangle count).
transitivity_graph = nx.transitivity(g_infnet6yr)
print('Transitivity:', transitivity_graph)
# Connected components of the six-year graph. `generateGCC` is a project
# helper returning the components and the share of the network each covers.
gccs, percentage = generateGCC(g_infnet6yr)

# display the connected components on a two-column grid:
fig = plt.figure(figsize=(10, 10))
# Fix: `len(gccs) / 2` is a float in Python 3 (add_subplot needs integers)
# and is one row short when the number of components is odd. Use an integer
# ceiling instead.
num_rows = (len(gccs) + 1) // 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_rows, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    nx.draw_networkx(
        g,
        node_color=color_by_inst(g, lookup_poinf),
        ax=ax,
        with_labels=False,
        node_size=40,
        pos=pos_6yr)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet6yr_CC.pdf", format='pdf', bbox_inches="tight")
# First component of the six-year graph, plus its own spring layout
# (`pos_gcc` is used by the community and influencer cells that follow).
main_gcc = gccs[0]
pos_gcc = nx.spring_layout(main_gcc)

# Draw the graph at the 20-year positions (`pos_full`) rather than `pos_gcc`.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax = add_inst_labels(ax)
ax.axis('off')
gcc6yr_draw_options = dict(
    pos=pos_full,
    with_labels=False,
    ax=ax,
    node_size=40,
    node_color=color_by_inst(main_gcc, lookup_poinf),
)
nx.draw_networkx(main_gcc, **gcc6yr_draw_options)

# The title is only set when the figure is being saved.
if SAVE_GRAPHS:
    ax.set_title('Largest connected component ({:.2%})'.format(percentage[0]))
    plt.savefig("IMG/infnet6yr_LargestCC.pdf", format='pdf', bbox_inches="tight")
print('number of nodes in largest connected component:', len(main_gcc))
print(nx.info(main_gcc))
# Degree distribution and power-law fit for the six-year giant component.
ax, degree_seq = degree_dist(main_gcc)
# $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig(
        "IMG/infnet6yr_LargestCC_degreeDist.pdf",
        format='pdf',
        bbox_inches="tight")

# Clustering (project helper; element 1 is reported as the average).
cc = clustering_coeff(main_gcc)
print('average clustering coefficient: ', cc[1])

# AVERAGE PATH (bare expression: only displayed when run as a notebook cell).
nx.average_shortest_path_length(main_gcc)

# Diameter:
gcc6yr_diameter = nx.diameter(main_gcc)
print('Diameter of graph:', gcc6yr_diameter)
# Community detection using modularity: `parts` maps node -> community id.
parts = community.best_partition(main_gcc)
# Community id of each node, in the graph's node order (used as colours).
values = [parts.get(node) for node in main_gcc.nodes()]

# Plot the graphs side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)')

# Left: nodes coloured by their actual institute.
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax1,
    node_size=40,
    # Consistency fix: pass the lookup table explicitly, as every other
    # drawing cell in this notebook does.
    node_color=color_by_inst(main_gcc, lookup_poinf))

# Right: nodes coloured by detected community.
ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=40,
    with_labels=False)
plt.savefig("IMG/infnet6yr_LargestCC_commCompare.pdf", format='pdf',
            bbox_inches="tight")
num_comm = len(set(parts.values()))
print('Number of communities detected =', num_comm)

# plot each community in its own panel of a two-column grid:
fig = plt.figure(figsize=(10, 30))
comms = set(parts.values())
# Fix: the grid was fixed at 6x2, so a 13th community made add_subplot fail.
# Keep six rows as the minimum (identical layout for up to 12 communities)
# and grow only when needed.
n_rows = max(6, (len(comms) + 1) // 2)
for i, comm_id in enumerate(comms, 1):
    ax = fig.add_subplot(n_rows, 2, i)
    # find those nodes belonging to this community:
    nodes_from_comm = [node_id for (node_id, c) in parts.items() if c == comm_id]
    # Generate the subgraph belonging to these nodes:
    subG = nx.subgraph(main_gcc, nodes_from_comm)
    ax.axis('off')
    ax.set_title('Community {}'.format(comm_id))
    nx.draw_networkx(
        subG,
        pos=pos_gcc,
        ax=ax,
        # Fix: colours are now derived from the subgraph that is drawn
        # (not from the plain id list, whose order need not match the
        # subgraph's node order), with the lookup table passed explicitly
        # as in the other drawing cells.
        node_color=color_by_inst(subG, lookup_poinf),
        node_size=40,
        with_labels=False)
# Institute legend entries are attached to the last panel only.
ax = add_inst_labels(ax)
plt.savefig("IMG/infnet6yr_LargestCC_communities.pdf", format='pdf',
            bbox_inches="tight")
# Betweenness centrality of every node in the six-year giant component
# (project helper); the `top` highest-scoring nodes are highlighted below.
bt = between_parallel(main_gcc)
top = 11
# we need to find the index of these max_nodes:
_nodes = list(main_gcc.nodes())
node_position = {node: idx for idx, node in enumerate(_nodes)}
max_nodes = sorted(bt.items(), key=lambda v: -v[1])[:top]
max_nodes

# Default size/colour for every node; highlighted nodes are overridden.
bt_values = [10] * len(_nodes)
bt_colors = ['xkcd:black'] * len(_nodes)
for max_key, max_val in max_nodes:
    idx = node_position[max_key]
    bt_values[idx] = (max_val * 150)**2.2
    # Fix: `pd_poinf` is never defined in this notebook; the individuals
    # table is `lookup_poinf`. `.iloc[0]` extracts the scalar explicitly
    # (calling int() on a one-element Series is deprecated in pandas).
    institute_class = lookup_poinf.institute_class.loc[[str(max_key)]].iloc[0]
    bt_colors[idx] = inst_by_color[int(institute_class)]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.axis("off")
# The title follows `top` instead of hard-coding 11.
plt.suptitle('The top {} influential individuals in the GCC'.format(top))
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values,
    with_labels=False)
plt.savefig("IMG/infnet6yr_influencer.pdf", format='pdf',
            bbox_inches="tight")

# Rows of the lookup table for the highlighted individuals.
top_ids = [a[0] for a in max_nodes]
lookup_poinf.loc[top_ids]
Weighted 6yr Model
In this section, we explore a weighted version of the simple undirected graph of the 6yr model. The weight of an edge is the sum of the contributions of that author pair over all their joint publications (publications which involve both of them, and possibly others).
Contributions are calculated as $$\text{contribution}(u,v) = \frac{1}{\text{size of complete graph}} = \frac{1}{{N}\choose{2}} = \frac{1}{\frac{N(N-1)}{2}} = \frac{2}{N(N-1)}$$ where $N$ is the number of authors for a publication.
Weight of the edge between authors $u$ and $v$: $$\text{weight}(u,v) = \sum_{p \in P(u,v)}\text{contribution}_{p}(u,v)$$ where $P(u,v)$ is the set of publications co-authored by both $u$ and $v$.
def get_edge_weight(authors):
    """Return the per-edge contribution of one publication.

    A publication with ``n`` authors forms a complete graph with
    ``n * (n - 1) / 2`` author pairs; each pair receives an equal share,
    ``2 / (n * (n - 1))``, so the shares of one publication sum to 1.

    Args:
        authors: sized collection of the publication's authors.

    Returns:
        float: ``2 / (n * (n - 1))`` when there are at least two authors,
        otherwise ``0.0`` (no pair exists).
    """
    n = len(authors)
    if n > 1:
        return 2. / (n * (n - 1))
    return 0.
# Per-publication contribution, computed from the length of `collab_id`.
combined_yrs['weight'] = combined_yrs.apply(
    lambda row: get_edge_weight(row.collab_id), axis=1)

# Build the weighted graph. The weight of an edge is the SUM of the
# contributions of every publication the two authors share.
# Fix: the previous loop called add_edge(a, b, weight=...) for each
# publication, which overwrites the attribute, so an edge ended up with the
# contribution of the last publication seen instead of the documented sum.
g_poinf_weighted = nx.Graph()
poinf_ids_weighted = lookup_poinf.index
for row in combined_yrs.itertuples():
    counted_pairs = set()  # pairs already credited for this publication
    for (a, b) in row.edges:
        # Only keep pairs where both authors are from informatics.
        if a not in poinf_ids_weighted or b not in poinf_ids_weighted:
            continue
        # Credit each unordered pair once per publication, even if the edge
        # list carries both orientations or a duplicate.
        pair = frozenset((a, b))
        if pair in counted_pairs:
            continue
        counted_pairs.add(pair)
        if g_poinf_weighted.has_edge(a, b):
            g_poinf_weighted[a][b]['weight'] += row.weight
        else:
            g_poinf_weighted.add_edge(a, b, weight=row.weight)

# Line widths for drawing: twice the edge weight.
edgewidth = [
    d['weight'] * 2. for (u, v, d) in g_poinf_weighted.edges(data=True)
]
print(nx.info(g_poinf_weighted))

# Sanity check: every node of the unweighted six-year graph must also be in
# the weighted graph.
assert not set(g_infnet6yr.nodes).difference(
    set(g_poinf_weighted.nodes
        )), "Same number of nodes for both 6yr model should be observer"
# Node ids of the 20-year graph, read back from the file written earlier.
# Fix: the file was opened without ever being closed; use a context manager.
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_1997-2017.txt'), 'r') as f:
    _NODES_ORDER = f.readlines()
NODES_ORDER = [n.strip() for n in _NODES_ORDER]

# To ensure that the id corresponding to each individual stays the same,
# first create a dump of the ids in the order used for the matrix.
# Project helper: returns the (weighted) adjacency matrix, the figure and
# the node order used.
adj_mat, fig, order = create_adj_mat(
    g_poinf_weighted, NODES_ORDER, draw=True, use_order=False, weighted=True)
print(adj_mat.shape)
print(len(order))
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_2012-2017_weighted.txt'), 'w') as f:
    f.write("\n".join(list(order)))

# save the weighted matrix:
adj_mat.dump(os.path.join(DATA_DIR, 'mat', 'infnet6yrs-weighted-adj-mat.order6yr.pkl'))
# Take a look at the distribution of the drawing widths (`edgewidth`),
# using 100 bins. Note that `fig` holds the return value of plt.hist here,
# not a Figure object.
fig = plt.hist(edgewidth, bins=100)
# plt.savefig('IMG/infnet6yrw_tofWeights.pdf', format='pdf', bbox_inches="tight")
Circular layout for each class
draw_default_layout(
g_poinf_weighted,
lookup_poinf,
'infnet6yrw',
with_weight=True,
scale=3,
SAVE_GRAPHS=True)
Circular Layout
edges_across = []
# Collect edges whose endpoints belong to different institute classes
# (appended to the `edges_across` list created just above). A pair is
# skipped when its reverse orientation is already recorded.
for (a, b) in g_poinf_weighted.edges:
    c_a = int(lookup_poinf.institute_class.loc[[str(a)]])
    c_b = int(lookup_poinf.institute_class.loc[[str(b)]])
    if c_a != c_b and (b, a) not in edges_across:
        edges_across.append((a, b))

draw_circular_layout(
    g_poinf_weighted,
    lookup_poinf,
    file_prefix='infnet6yrw',
    with_weight=True,
    scale=5,
    SAVE_GRAPHS=True)
def _join_names(x_y):
    """Join a (first_name, last_name) pair with a single space."""
    return " ".join([x_y[0], x_y[1]])


# Full display name for every individual.
name_pairs = zip(lookup_poinf['first_name'], lookup_poinf['last_name'])
lookup_poinf['full_name'] = [_join_names(pair) for pair in name_pairs]

# Spring layout for the weighted graph; `pos` is reused by later cells.
pos = nx.spring_layout(g_poinf_weighted, k=.14)
g = g_poinf_weighted
scale = 5
# Line widths: edge weight multiplied by `scale`.
edgewidth = [d['weight'] * float(scale) for (u, v, d) in g.edges(data=True)]
# pos = nx.kamada_kawai_layout(g_poinf_weighted, pos=pos,scale=10)

f = plt.figure(figsize=(20, 18))
ax = f.add_subplot(111)

# Node id -> full name, used as the node labels.
label_dict = {}
for n in g_poinf_weighted.nodes:
    label_dict[n] = lookup_poinf.loc[n].full_name

labelled_draw_options = dict(
    pos=pos,
    ax=ax,
    width=edgewidth,
    node_size=80,
    edge_color='#999966',
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf),
)
nx.draw(g_poinf_weighted, **labelled_draw_options)

# Institute legend above the plot.
add_inst_labels(ax)
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
    title='institutes',
    fontsize=12,
    fancybox=True,
    shadow=False)
nx.draw_networkx_labels(
    g_poinf_weighted,
    pos=pos,
    labels=label_dict,
    ax=ax,
    font_size=12.5,
    font_weight='bold')
f.tight_layout()
f.savefig('IMG/infnet6yrw_spring_wlabels.pdf', format='pdf', bbox_inches='tight')
Spring layout
# Draw the graph:
# Weighted six-year graph at its own spring-layout positions (`pos`), with
# line widths taken from `edgewidth`.
fig = plt.figure(figsize=(18, 18))
ax = fig.add_subplot(111)
# ax=add_inst_labels(ax)
ax.axis('off')
# Fix: `with_labels` is not a parameter of draw_networkx_nodes (it belongs
# to draw_networkx); NetworkX 2.5+ rejects it with a TypeError, and older
# releases silently ignored it, so it is dropped.
nx.draw_networkx_nodes(
    g_poinf_weighted,
    pos=pos,
    ax=ax,
    node_size=90,
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf))
# The axis is passed explicitly; previously the edges landed on the current
# axes, which is this same axis.
nx.draw_networkx_edges(
    g_poinf_weighted,
    pos,
    ax=ax,
    width=edgewidth,
)
fig.tight_layout()
fig.savefig('IMG/infnet6yrw_springv2.pdf', format='pdf', bbox_inches='tight')
# Weighted six-year graph drawn at the 20-year positions (`pos_full`) so it
# can be compared with the 20-year drawings.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax=add_inst_labels(ax)
ax.axis('off')
# Fix: `with_labels` is not a parameter of draw_networkx_nodes (it belongs
# to draw_networkx); NetworkX 2.5+ rejects it with a TypeError, and older
# releases silently ignored it, so it is dropped.
nx.draw_networkx_nodes(
    g_poinf_weighted,
    pos=pos_full,
    ax=ax,
    node_size=20,
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf))
# The axis is passed explicitly; previously the edges landed on the current
# axes, which is this same axis.
nx.draw_networkx_edges(
    g_poinf_weighted,
    pos_full,
    ax=ax,
    width=edgewidth,
)
fig.savefig('IMG/infnet6yrw_withinfnet20yrpos.png', format='png', bbox_inches='tight')
Degree distribution is the same as 3.2.1
# Weighted average clustering coefficient of the whole weighted graph, using
# the 'weight' edge attribute.
print('Average clustering coeff: ',
      nx.average_clustering(g_poinf_weighted, weight='weight'))
This is lower than the unweighted graph ( 0.421964851597)!
# Connected components of the weighted graph. `generateGCC` is a project
# helper returning the components and the share of the network each covers.
gccs, percentage = generateGCC(g_poinf_weighted)

# display the connected components on a two-column grid:
fig = plt.figure(figsize=(10, 10))
# Fix: `len(gccs) / 2` is a float in Python 3 (add_subplot needs integers)
# and is one row short when the number of components is odd. Use an integer
# ceiling instead.
num_rows = (len(gccs) + 1) // 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_rows, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    # Fix: `with_labels` is not a draw_networkx_nodes parameter (TypeError
    # on NetworkX 2.5+, ignored before), so it is dropped.
    nx.draw_networkx_nodes(
        g,
        node_color=color_by_inst(g, lookup_poinf),
        ax=ax,
        node_size=20,
        pos=pos_6yr)
    # Line widths: three times the edge weight. The axis is passed
    # explicitly; it is the same axis the edges previously landed on.
    g_edgewidth = [d['weight'] for (u, v, d) in g.edges(data=True)]
    nx.draw_networkx_edges(
        g, pos_6yr, ax=ax, width=np.multiply(g_edgewidth, 3))
plt.savefig("IMG/infnetweighted_CC.pdf", format='pdf', bbox_inches="tight")
# First component of the weighted graph.
main_gcc = gccs[0]
print('number of nodes in largest connected component:', len(main_gcc))
print(nx.info(main_gcc))

# Draw the graph. `pos_gcc` is the layout computed in the unweighted
# six-year GCC cell and is reused here.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax = add_inst_labels(ax)
ax.axis('off')
# Fix: `with_labels` is not a draw_networkx_nodes parameter (TypeError on
# NetworkX 2.5+, ignored before), so it is dropped.
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_size=20,
    node_color=color_by_inst(main_gcc, lookup_poinf))
# Raw edge weights; drawn at three times the weight. Later cells reuse
# `main_gcc_edgewidth`.
main_gcc_edgewidth = [d['weight'] for (u, v, d) in main_gcc.edges(data=True)]
nx.draw_networkx_edges(
    main_gcc, pos_gcc, ax=ax, width=np.multiply(main_gcc_edgewidth, 3))
ax.set_title('Largest connected component ({:.2%})'.format(percentage[0]))
if SAVE_GRAPHS:
    plt.savefig(
        "IMG/infnet6yrw_LargestCC.pdf", format='pdf', bbox_inches="tight")

# Weighted statistics for the component.
print('Average clustering coeff: ',
      nx.average_clustering(main_gcc, weight='weight'))
print('Average shortest path length: ',
      nx.average_shortest_path_length(main_gcc, weight='weight'))
# Community detection using modularity: `parts` maps node -> community id.
parts = community.best_partition(main_gcc)
# Community id of each node, in the graph's node order (used as colours).
values = [parts.get(node) for node in main_gcc.nodes()]
print('Number of communities detected: ', len(set(values)))

# Plot the graphs side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle(
    'Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)'
)
gcc_edge_widths = np.multiply(main_gcc_edgewidth, 3)

# Left: nodes coloured by their actual institute.
# Fixes in both panels: `with_labels` is not a draw_networkx_nodes parameter
# (TypeError on NetworkX 2.5+, ignored before) and is dropped; the lookup
# table is passed to `color_by_inst` as in the other drawing cells; edges
# are drawn on an explicit axis (the same one they previously landed on).
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    ax=ax1,
    node_size=20,
    node_color=color_by_inst(main_gcc, lookup_poinf))
nx.draw_networkx_edges(main_gcc, pos_gcc, ax=ax1, width=gcc_edge_widths)

# Right: nodes coloured by detected community.
ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=20)
nx.draw_networkx_edges(main_gcc, pos_gcc, ax=ax2, width=gcc_edge_widths)
plt.savefig(
    "IMG/infnetweight_LargestCC_commCompare.pdf",
    format='pdf',
    bbox_inches="tight")
We use the top scores for the eigenvector centrality to determine who are the most influential individuals
# Weighted eigenvector centrality of every node in the weighted giant
# component; the `top` highest-scoring nodes are highlighted (`top` is set
# in the earlier influencer cells).
evect_centrality = nx.eigenvector_centrality_numpy(main_gcc, weight='weight')
max_nodes = sorted(evect_centrality.items(), key=lambda v: -v[1])[:top]
max_nodes

# Fix: this cell reused `_nodes` from the unweighted GCC cell. The size and
# colour lists must follow the node order of the graph that is actually
# drawn, so the order is taken from the current `main_gcc`.
_nodes = list(main_gcc.nodes())
node_position = {node: idx for idx, node in enumerate(_nodes)}

# Default size/colour for every node; highlighted nodes are overridden.
bt_values = [10] * len(_nodes)
bt_colors = ['xkcd:black'] * len(_nodes)
for max_key, max_val in max_nodes:
    idx = node_position[max_key]
    bt_values[idx] = (max_val * 100)**2
    # Fix: `pd_poinf` is never defined in this notebook; the individuals
    # table is `lookup_poinf`. `.iloc[0]` extracts the scalar explicitly
    # (calling int() on a one-element Series is deprecated in pandas).
    institute_class = lookup_poinf.institute_class.loc[[str(max_key)]].iloc[0]
    bt_colors[idx] = inst_by_color[int(institute_class)]

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.axis("off")
# The title follows `top` instead of hard-coding 11.
plt.suptitle('The top {} influential individuals in the GCC'.format(top))
# Fix: `with_labels` is not a draw_networkx_nodes parameter (TypeError on
# NetworkX 2.5+, ignored before), so it is dropped.
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values)
nx.draw_networkx_edges(
    main_gcc, pos_gcc, ax=ax, width=np.multiply(main_gcc_edgewidth, 3))
plt.savefig(
    "IMG/infnetweight_influencer.pdf", format='pdf', bbox_inches="tight")

# Rows of the lookup table for the highlighted individuals.
top_ids = [a[0] for a in max_nodes]
lookup_poinf.loc[top_ids]